Prep

#' DEAprep
#'
#' Generates control samples out of the average values over all supplied samples, combines spliced and unspliced
#' assays and calculates their logCPM & log2FC data. 
#' All is assembled into one SE object. 
#' 
#'
#' @param se SummarizedExperiment object containing assays of raw counts of spliced & unspliced tx
#'
#' @return an SE object of combined spliced and unspliced data as well as control samples generated from averaging over all treatment samples
#' 
DEAprep <- function(se){
  
  # allocation
  
  e1 <- assays(se)$spliced
  e2 <- assays(se)$unspliced
  readtype1 <- "spliced"
  readtype2 <- "unspliced"
  
  
  # generate control
  
  ## create logcpm assays for both spliced & unspliced assays
  assays(se)$logcpm.spliced <- log1p(cpm(calcNormFactors(DGEList(assays(se)$spliced))))
  assays(se)$logcpm.unspliced <- log1p(cpm(calcNormFactors(DGEList(assays(se)$unspliced))))

  ## generate a 'control' sample out of the median normalized counts over all samples
  e.ctrl1 <- genCTRL(se, readtype1, "logcpm.spliced")
  e.ctrl2 <- genCTRL(se, readtype2, "logcpm.unspliced")
  
  ## add control samples to assays & generate DGEList object
  dds1 <- DGEList(cbind(e1, e.ctrl1))
  dds2 <- DGEList(cbind(e2, e.ctrl2))
  
  ## combine the spliced & unspliced assays
  dds <- cbind(dds1, dds2)

  
  # generate colData
  
  dd1 <- rbind(colData(se)[,c("Batch","miRNA","Cell_Line")], 
               data.frame(Batch=unique(se$Batch), miRNA="CTRL", Cell_Line=unique(se$Cell_Line)))
  dd1$readtype <- readtype1
  
  ## duplicate dd to have data for combined spliced & unspliced assay
  dd <- rbind(dd1, dd1)
  dd$readtype[(nrow(dd1)+1):nrow(dd)] <- readtype2
  dd$readtype <- as.factor(dd$readtype)
  
  ## rename both dds & dd object features
  n.cols1 <- sapply(colnames(dds1), FUN=function(x)
      var_name <- paste(x, readtype1, sep=".") )
  n.cols2 <- sapply(colnames(dds2), FUN=function(x)
      var_name <- paste(x, readtype2, sep=".") )
  
  colnames(dds) <- c(n.cols1, n.cols2)
  rownames(dd) <- c(n.cols1, n.cols2)
  
  
  # rebuild SE object
  
  ## SE object with logCPM assay
  se <- SummarizedExperiment(assays=list(counts=dds$counts), 
                             rowData=rowData(se), 
                             colData=dd) 
  assays(se)$logcpm <- log1p(cpm(calcNormFactors(DGEList(assay(se)))))
  
  return(se)
}

PCA

HeLa

HEK

DEA

#' exonDEA
#'
#' @param se SE object containing assays with combined spliced & unspliced counts
#' @param model function: model matrix function to test
#' @param model0 function: model matrix function to be tested against
#' @param control string: name of control samples inside colData() of supplied SE object
#'
#' @return SE object containing DEA results over all treatments & over individual ones
#' 
exonDEA <- function(se, model, model0=~1, control){
  
  ## allocation
  se$miRNA <- relevel(droplevels(se$miRNA), ref=control)
  se$readtype <- relevel(se$readtype, ref="unspliced")
  se$Batch <- droplevels(se$Batch)
  dd <- colData(se)
  ## normalization
  dds <- calcNormFactors(DGEList(assays(se)$counts))
  ## models
  mm <- model.matrix(model, data=dd)
  mm0 <- model.matrix(model0, data=dd)
  testCoeff <- setdiff(colnames(mm), colnames(mm0))
  ## estimate dispersion
  dds <- estimateDisp(dds,mm)
  ## fit negative binomial distribution on counts per gene (use glmFit for few replicates)
  fit <- glmFit(dds, mm)
  ## fit a GLM on the data
  lrt.comb <- glmLRT(fit, testCoeff)
  ## top genes that change relative to stage 0
  res.comb <- as.data.frame(topTags(lrt.comb, Inf))
  ## fit linear model dropping one sample at a time (using multiple cores)
  res.list <- bplapply( testCoeff, BPPARAM=MulticoreParam(8), FUN=function(x){
    as.data.frame(topTags(glmLRT(fit, x),Inf))
  })
  dea.names <- gsub("readtype","", testCoeff)
  dea.names <- make.names(gsub(":miRNA",".", dea.names))
  colnames(res.comb)[grepl("logFC",colnames(res.comb))] <- paste0("logFC.", dea.names)
  names(res.list) <- paste0("DEA.",dea.names)
  
  ## add DEAs
  rowData(se)$DEA.spliced.all <- DataFrame(res.comb[rownames(se),])
  for(i in paste0("DEA.",dea.names)){
    rowData(se)[[i]] <- DataFrame(res.list[[i]][rownames(se),])
  }

  ## add logFC assay
  se <- plgINS::svacor(se, form = model, form0 = model0)
  se <- SEtools::log2FC(se, controls = se$miRNA==control, by = se$readtype, fromAssay = "corrected", isLog = TRUE)
  
  return(se)
}
## Number of significant surrogate variables is:  6 
## Iteration (out of 5 ):1  2  3  4  5
##      DEA.spliced.all   DEA.spliced.let.7a    DEA.spliced.lsy.6 
##                 1308                   32                   44 
##    DEA.spliced.miR.1  DEA.spliced.miR.124  DEA.spliced.miR.137 
##                  242                  293                  115 
##  DEA.spliced.miR.139  DEA.spliced.miR.143  DEA.spliced.miR.144 
##                    4                    5                    0 
##  DEA.spliced.miR.153  DEA.spliced.miR.155  DEA.spliced.miR.182 
##                   15                   77                   77 
## DEA.spliced.miR.199a  DEA.spliced.miR.204  DEA.spliced.miR.205 
##                   62                   19                   14 
## DEA.spliced.miR.216b  DEA.spliced.miR.223    DEA.spliced.miR.7 
##                   19                   58                  181
## Number of significant surrogate variables is:  6 
## Iteration (out of 5 ):1  2  3  4  5
##      DEA.spliced.all  DEA.spliced.miR.122  DEA.spliced.miR.133 
##                 1814                   36                   78 
##  DEA.spliced.miR.138  DEA.spliced.miR.145  DEA.spliced.miR.184 
##                  177                   85                   10 
## DEA.spliced.miR.190a DEA.spliced.miR.200b DEA.spliced.miR.216a 
##                   99                  152                   25 
##  DEA.spliced.miR.217 DEA.spliced.miR.219a  DEA.spliced.miR.375 
##                   38                   79                  107 
## DEA.spliced.miR.451a 
##                   21
## Number of significant surrogate variables is:  7 
## Iteration (out of 5 ):1  2  3  4  5
##      DEA.spliced.all   DEA.spliced.let.7a    DEA.spliced.lsy.6 
##                 6347                  433                  244 
##    DEA.spliced.miR.1  DEA.spliced.miR.124  DEA.spliced.miR.137 
##                 1441                 1286                  581 
##  DEA.spliced.miR.139  DEA.spliced.miR.143  DEA.spliced.miR.144 
##                   38                   64                   27 
##  DEA.spliced.miR.153  DEA.spliced.miR.155  DEA.spliced.miR.182 
##                  175                  369                  304 
## DEA.spliced.miR.199a  DEA.spliced.miR.204  DEA.spliced.miR.205 
##                  231                  101                   73 
## DEA.spliced.miR.216b  DEA.spliced.miR.223    DEA.spliced.miR.7 
##                  183                  306                  602
## Number of significant surrogate variables is:  5 
## Iteration (out of 5 ):1  2  3  4  5
##      DEA.spliced.all  DEA.spliced.miR.122  DEA.spliced.miR.133 
##                 2746                   58                  134 
##  DEA.spliced.miR.138  DEA.spliced.miR.145  DEA.spliced.miR.184 
##                  247                  152                   23 
## DEA.spliced.miR.190a DEA.spliced.miR.200b DEA.spliced.miR.216a 
##                  148                  210                   50 
##  DEA.spliced.miR.217 DEA.spliced.miR.219a  DEA.spliced.miR.375 
##                   79                  106                  162 
## DEA.spliced.miR.451a 
##                   34
## Number of significant surrogate variables is:  9 
## Iteration (out of 5 ):1  2  3  4  5
##   DEA.spliced.all   DEA.miRNAlet.7a    DEA.miRNAlsy.6    DEA.miRNAmiR.1 
##             10952               926               129              2585 
##  DEA.miRNAmiR.124  DEA.miRNAmiR.137  DEA.miRNAmiR.139  DEA.miRNAmiR.143 
##              2139              1923                 4                 9 
##  DEA.miRNAmiR.144  DEA.miRNAmiR.153  DEA.miRNAmiR.155  DEA.miRNAmiR.182 
##                23               770               222               134 
## DEA.miRNAmiR.199a  DEA.miRNAmiR.204  DEA.miRNAmiR.205 DEA.miRNAmiR.216b 
##               376               103               120                85 
##  DEA.miRNAmiR.223    DEA.miRNAmiR.7 
##               238               764
## Number of significant surrogate variables is:  11 
## Iteration (out of 5 ):1  2  3  4  5
##   DEA.spliced.all  DEA.miRNAmiR.122  DEA.miRNAmiR.133  DEA.miRNAmiR.138 
##             11571               246               212              3030 
##  DEA.miRNAmiR.145  DEA.miRNAmiR.184 DEA.miRNAmiR.190a DEA.miRNAmiR.200b 
##               345                15               452               701 
## DEA.miRNAmiR.216a  DEA.miRNAmiR.217 DEA.miRNAmiR.219a  DEA.miRNAmiR.375 
##               109               246               207               533 
## DEA.miRNAmiR.451a 
##                21
## Number of significant surrogate variables is:  6 
## Iteration (out of 5 ):1  2  3  4  5
##      DEA.spliced.all      DEA.miRNAlet.7a       DEA.miRNAlsy.6 
##                 7789                   37                    2 
##       DEA.miRNAmiR.1     DEA.miRNAmiR.124     DEA.miRNAmiR.137 
##                  484                  239                  206 
##     DEA.miRNAmiR.139     DEA.miRNAmiR.143     DEA.miRNAmiR.144 
##                    0                    0                    0 
##     DEA.miRNAmiR.153     DEA.miRNAmiR.155     DEA.miRNAmiR.182 
##                  130                   11                   10 
##    DEA.miRNAmiR.199a     DEA.miRNAmiR.204     DEA.miRNAmiR.205 
##                   35                    7                    1 
##    DEA.miRNAmiR.216b     DEA.miRNAmiR.223       DEA.miRNAmiR.7 
##                    2                    5                   55 
##   DEA.spliced.let.7a    DEA.spliced.lsy.6    DEA.spliced.miR.1 
##                   32                   44                  242 
##  DEA.spliced.miR.124  DEA.spliced.miR.137  DEA.spliced.miR.139 
##                  293                  115                    4 
##  DEA.spliced.miR.143  DEA.spliced.miR.144  DEA.spliced.miR.153 
##                    5                    0                   15 
##  DEA.spliced.miR.155  DEA.spliced.miR.182 DEA.spliced.miR.199a 
##                   77                   77                   62 
##  DEA.spliced.miR.204  DEA.spliced.miR.205 DEA.spliced.miR.216b 
##                   19                   14                   19 
##  DEA.spliced.miR.223    DEA.spliced.miR.7 
##                   58                  181
## Number of significant surrogate variables is:  6 
## Iteration (out of 5 ):1  2  3  4  5
##      DEA.spliced.all     DEA.miRNAmiR.122     DEA.miRNAmiR.133 
##                 9699                    2                   31 
##     DEA.miRNAmiR.138     DEA.miRNAmiR.145     DEA.miRNAmiR.184 
##                  738                   12                    3 
##    DEA.miRNAmiR.190a    DEA.miRNAmiR.200b    DEA.miRNAmiR.216a 
##                   16                   55                    3 
##     DEA.miRNAmiR.217    DEA.miRNAmiR.219a     DEA.miRNAmiR.375 
##                  103                   15                  136 
##    DEA.miRNAmiR.451a  DEA.spliced.miR.122  DEA.spliced.miR.133 
##                    0                   36                   78 
##  DEA.spliced.miR.138  DEA.spliced.miR.145  DEA.spliced.miR.184 
##                  177                   85                   10 
## DEA.spliced.miR.190a DEA.spliced.miR.200b DEA.spliced.miR.216a 
##                   99                  152                   25 
##  DEA.spliced.miR.217 DEA.spliced.miR.219a  DEA.spliced.miR.375 
##                   38                   79                  107 
## DEA.spliced.miR.451a 
##                   21

Plots

Stats

up-/downregulations at different significance levels

Investigate upregulated spliced tx

Analysis

# Check if log2FC upregulations are due to skewed controls 


ctrlAnalysis <- function(se, pert, TS, fc.degree){
  
  ## vector of Bartel miRNA treatments and their corresponding seeds (based on actual miRNA seqs)
  pert.seed <- sapply(se$miRNA[se$miRNA!="CTRL" & se$readtype=="spliced"], function(x){
    as.character(pert$seed[grepl(paste0(x,"$"), pert$treatment) | grepl(paste0(x,"\\("), pert$treatment)])
    })
  names(pert.seed) <- se$miRNA[se$miRNA!="CTRL" & se$readtype=="spliced"]
  
  ## adjusted gene names to be identical to TS names
  fc.genes <- as.character(rowData(se)$gene_name)
  ## only consider common genes
  id <- fc.genes %in% TS$feature
  ## significant ids
  id.sig <- id & rowData(se)$DEA.spliced.all$FDR < .05
  
  ## aggregate logFC by genes
  logfc <- cbind(as.data.frame(assays(se)$log2FC),fc.genes)
  logfc <- aggregate(logfc[,colnames(logfc)!="fc.genes"], by=list(logfc$fc.genes), FUN="sum")
  rownames(logfc) <- logfc$Group.1
  logfc.sign <- logfc.sign.deg <- sign(logfc[,colnames(logfc)!="Group.1"])
  
  ## update gene names, DEA and IDs: aggregate by genes
  ### aggregate DEA FDR info
  fdr <- rowData(se)$DEA.spliced.all
  rownames(fdr) <- fc.genes
  fdr <- aggregate(fdr[fc.genes[id],"FDR"], by=list(rownames(fdr[fc.genes[id],])), FUN="min")
  rownames(fdr) <- fdr$Group.1
  colnames(fdr) <- c("genes","FDR")
  ### adjusted gene names to be identical to TS names
  fc.genes <- rownames(logfc)
  ### only consider common genes
  id <- fc.genes %in% TS$feature
  ### significant ids
  id.sig <- fc.genes %in% rownames(fdr)[fdr$FDR<.05]
  
  ## for splicing: only genes common with TS & with logfc createer than 'fc.degree' & significant FDR
  logfc.sign.deg[logfc[,colnames(logfc)!="Group.1"] < fc.degree] <- 0
  logfc.sign.deg <- logfc.sign.deg[rownames(logfc.sign.deg) %in% fc.genes[id.sig] & rowSums(logfc.sign.deg)>0, se$miRNA!="CTRL"]
  ## for control: only genes common with TS
  logfc.sign <- logfc.sign[rownames(logfc.sign) %in% fc.genes[id], se$miRNA!="CTRL"]
             
  
  ## to find treatment miRNA for each gene for which the logFC are upregulated
  nonUpregSeeds <- function(readtype, logfc, pert.seed){
    l <- lapply(readtype, function(i){
      apply(logfc[,readtype==i], 1, function(x){
        pert.seed[x!=1]
        })
      })
    return(l)
  }
  
  ## splicing: find treatment miRNAs potentially targeting each gene
  n <- nonUpregSeeds(unique(se$readtype), logfc.sign.deg, pert.seed)
  names(n) <- unique(se$readtype)
  ## control: background distribution of upregulation instances for each gene
  m <- nonUpregSeeds(unique(se$readtype), logfc.sign, pert.seed)
  names(m) <- paste0("ctrl.",unique(se$readtype))
  ## combine
  n <- c(n,m)
  
  
  ## for each common gene select the miRNA families that target them (using TargetScan data)
  ts.all <- lapply(fc.genes[id], function(x){
    TS$family[TS$feature==x]
    })
  names(ts.all) <- fc.genes[id]
  ts.list <- list(splicing=ts.all[fc.genes[id.sig]], ctrl=ts.all[fc.genes[id]])
  
  ## ratio of treatment miRNA are in the TS target list
  ratios <- lapply(n, function(x){
    if( length(x)==nrow(logfc.sign) ){
      sapply( names(x), function(y) sum(x[[y]] %in% ts.list$ctrl[[y]])/length(x[[y]]) )
    } else {
      sapply( names(x), function(y) sum(x[[y]] %in% ts.list$splicing[[y]])/length(x[[y]]) )
    }
  })
  
  # output
  
  return(list(n=n,ratios=ratios))
}

plots HeLa

##        spliced      unspliced   ctrl.spliced ctrl.unspliced 
##       32.37133       32.88256       16.89632       16.74532
##        spliced      unspliced   ctrl.spliced ctrl.unspliced 
##             33             33             17             17

##        spliced      unspliced   ctrl.spliced ctrl.unspliced 
##     0.12986543     0.12855534     0.06738572     0.06554005
##        spliced      unspliced   ctrl.spliced ctrl.unspliced 
##      0.1212121      0.1176471      0.0000000      0.0000000

plots HEK

##        spliced      unspliced   ctrl.spliced ctrl.unspliced 
##       23.01278       23.33227       11.72396       11.86510
##        spliced      unspliced   ctrl.spliced ctrl.unspliced 
##             23             24             12             12

##        spliced      unspliced   ctrl.spliced ctrl.unspliced 
##     0.12986543     0.12855534     0.06738572     0.06554005
##        spliced      unspliced   ctrl.spliced ctrl.unspliced 
##      0.1212121      0.1176471      0.0000000      0.0000000

Summary: As expected, there seem to be genes that are similarly upregulated over a majority of the treatment miRNA. They thus skew the control samples we created (average counts over all treatments for each gene). To eliminate this distortion we exclude these genes from the datasets.